This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.

library(dplyr)

Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union
# Step 1: Load the dataset
crime_data <- read.csv("Crimes_-_2001_to_Present_20241205.csv")
summary(crime_data)
       ID           Case.Number            Date              Block               IUCR           Primary.Type       Description       
 Min.   :     634   Length:8212899     Length:8212899     Length:8212899     Length:8212899     Length:8212899     Length:8212899    
 1st Qu.: 3966344   Class :character   Class :character   Class :character   Class :character   Class :character   Class :character  
 Median : 7352976   Mode  :character   Mode  :character   Mode  :character   Mode  :character   Mode  :character   Mode  :character  
 Mean   : 7355141                                                                                                                    
 3rd Qu.:10667728                                                                                                                    
 Max.   :13681835                                                                                                                    
                                                                                                                                     
 Location.Description    Arrest            Domestic              Beat         District         Ward        Community.Area  
 Length:8212899       Length:8212899     Length:8212899     Min.   : 111   Min.   : 1.0   Min.   : 1.0     Min.   : 0.0    
 Class :character     Class :character   Class :character   1st Qu.: 621   1st Qu.: 6.0   1st Qu.:10.0     1st Qu.:23.0    
 Mode  :character     Mode  :character   Mode  :character   Median :1034   Median :10.0   Median :23.0     Median :32.0    
                                                            Mean   :1184   Mean   :11.3   Mean   :22.8     Mean   :37.4    
                                                            3rd Qu.:1731   3rd Qu.:17.0   3rd Qu.:34.0     3rd Qu.:56.0    
                                                            Max.   :2535   Max.   :31.0   Max.   :50.0     Max.   :77.0    
                                                                           NA's   :47     NA's   :614830   NA's   :613456  
   FBI.Code          X.Coordinate      Y.Coordinate          Year       Updated.On           Latitude       Longitude     
 Length:8212899     Min.   :      0   Min.   :      0   Min.   :2001   Length:8212899     Min.   :36.62   Min.   :-91.69  
 Class :character   1st Qu.:1153039   1st Qu.:1859167   1st Qu.:2005   Class :character   1st Qu.:41.77   1st Qu.:-87.71  
 Mode  :character   Median :1166176   Median :1890924   Median :2010   Mode  :character   Median :41.86   Median :-87.67  
                    Mean   :1164632   Mean   :1885878   Mean   :2011                      Mean   :41.84   Mean   :-87.67  
                    3rd Qu.:1176391   3rd Qu.:1909391   3rd Qu.:2016                      3rd Qu.:41.91   3rd Qu.:-87.63  
                    Max.   :1205119   Max.   :1951622   Max.   :2024                      Max.   :42.02   Max.   :-87.52  
                    NA's   :90296     NA's   :90296                                       NA's   :90296   NA's   :90296   
   Location        
 Length:8212899    
 Class :character  
 Mode  :character  
                   
                   
                   
                   
head(crime_data$Date)
[1] "03/18/2015 12:00:00 PM" "12/20/2018 03:00:00 PM" "05/01/2016 12:25:00 AM" "12/20/2018 04:00:00 PM" "06/01/2014 12:01:00 AM"
[6] "09/01/2018 12:01:00 AM"
nrow(crime_data)
[1] 8212899
limited_crime_data = crime_data
head(limited_crime_data$Date)
[1] "03/18/2015 12:00:00 PM" "12/20/2018 03:00:00 PM" "05/01/2016 12:25:00 AM" "12/20/2018 04:00:00 PM" "06/01/2014 12:01:00 AM"
[6] "09/01/2018 12:01:00 AM"
limited_crime_data$Date <- as.Date(limited_crime_data$Date, format = "%m/%d/%Y")


filtered_data <- limited_crime_data %>%
  filter(between(Date, as.Date("2001-01-01"),
         as.Date("2015-04-01")))

summary(filtered_data)
       ID           Case.Number             Date               Block               IUCR           Primary.Type       Description       
 Min.   :     634   Length:5779677     Min.   :2001-01-01   Length:5779677     Length:5779677     Length:5779677     Length:5779677    
 1st Qu.: 3123052   Class :character   1st Qu.:2003-12-28   Class :character   Class :character   Class :character   Class :character  
 Median : 5392262   Mode  :character   Median :2007-03-08   Mode  :character   Mode  :character   Mode  :character   Mode  :character  
 Mean   : 5463237                      Mean   :2007-06-09                                                                              
 3rd Qu.: 7713084                      3rd Qu.:2010-09-06                                                                              
 Max.   :13679478                      Max.   :2015-04-01                                                                              
                                                                                                                                       
 Location.Description    Arrest            Domestic              Beat         District          Ward        Community.Area  
 Length:5779677       Length:5779677     Length:5779677     Min.   : 111   Min.   : 1.00   Min.   : 1.0     Min.   : 0.0    
 Class :character     Class :character   Class :character   1st Qu.: 623   1st Qu.: 6.00   1st Qu.:10.0     1st Qu.:23.0    
 Mode  :character     Mode  :character   Mode  :character   Median :1111   Median :10.00   Median :22.0     Median :32.0    
                                                            Mean   :1200   Mean   :11.32   Mean   :22.6     Mean   :37.7    
                                                            3rd Qu.:1732   3rd Qu.:17.00   3rd Qu.:34.0     3rd Qu.:58.0    
                                                            Max.   :2535   Max.   :31.00   Max.   :50.0     Max.   :77.0    
                                                                           NA's   :46      NA's   :614776   NA's   :613452  
   FBI.Code          X.Coordinate      Y.Coordinate          Year       Updated.On           Latitude       Longitude     
 Length:5779677     Min.   :      0   Min.   :      0   Min.   :2001   Length:5779677     Min.   :36.62   Min.   :-91.69  
 Class :character   1st Qu.:1152920   1st Qu.:1859151   1st Qu.:2003   Class :character   1st Qu.:41.77   1st Qu.:-87.71  
 Mode  :character   Median :1165913   Median :1890081   Median :2007   Mode  :character   Median :41.85   Median :-87.67  
                    Mean   :1164477   Mean   :1885617   Mean   :2007                      Mean   :41.84   Mean   :-87.67  
                    3rd Qu.:1176336   3rd Qu.:1909421   3rd Qu.:2010                      3rd Qu.:41.91   3rd Qu.:-87.63  
                    Max.   :1205119   Max.   :1951622   Max.   :2015                      Max.   :42.02   Max.   :-87.52  
                    NA's   :53228     NA's   :53228                                       NA's   :53228   NA's   :53228   
   Location        
 Length:5779677    
 Class :character  
 Mode  :character  
                   
                   
                   
                   
nrow(filtered_data)
[1] 5779677
head(filtered_data)
filtered_data <-na.omit(filtered_data)
filtered_data$Date <- as.Date(filtered_data$Date, format = "%Y")
filtered_data$Date <- format(filtered_data$Date, "%Y")
filtered_data <- distinct(filtered_data)
head(filtered_data)

sampled_data <- filtered_data %>%  group_by(Date) %>% slice_sample(n = 1000, replace = FALSE)
nrow(sampled_data)
[1] 15000
head(sampled_data)



selected_features <- c("Location.Description", "Date", "District", "Ward",
                       "Longitude", "Latitude", "Primary.Type", "Description")
data_cleaned <- sampled_data %>% select(all_of(selected_features))

summary(data_cleaned)
 Location.Description     Date              District          Ward         Longitude         Latitude     Primary.Type      
 Length:15000         Length:15000       Min.   : 1.00   Min.   : 1.00   Min.   :-91.69   Min.   :36.62   Length:15000      
 Class :character     Class :character   1st Qu.: 6.00   1st Qu.:11.00   1st Qu.:-87.72   1st Qu.:41.77   Class :character  
 Mode  :character     Mode  :character   Median :10.00   Median :23.00   Median :-87.67   Median :41.86   Mode  :character  
                                         Mean   :11.33   Mean   :22.84   Mean   :-87.67   Mean   :41.84                     
                                         3rd Qu.:17.00   3rd Qu.:34.00   3rd Qu.:-87.63   3rd Qu.:41.91                     
                                         Max.   :25.00   Max.   :50.00   Max.   :-87.52   Max.   :42.02                     
 Description       
 Length:15000      
 Class :character  
 Mode  :character  
                   
                   
                   
nrow(data_cleaned)
[1] 15000

#Normalization
normal <-function(x) {
  (x -min(x))/(max(x)-min(x))  
}
data_cleaned <-as.data.frame(data_cleaned)

#Seperate features and targets
#features <- data_cleaned %>%
#  select(-Primary.Type, -Description, -Location.Description, -Date)
features <- data_cleaned %>%
  select(-Location.Description, -Date, -Description)
targets <- data_cleaned %>% select(Primary.Type, Description)

data_cleaned[3:6] <- as.data.frame(lapply(data_cleaned[,c(3,4,5,6)], normal))
norm_features <-data_cleaned
#norm_features <- as.data.frame(lapply(data_cleaned[,c(3,4,5,6)], normal))



# Divide data into training and testing split
set.seed(123)
ran <- sample(1:nrow(data_cleaned), 0.99 * nrow(data_cleaned)) 

#get training stuff
train_features <- norm_features[ran,]
#train_primary_type <- targets$Primary.Type[ran]
#train_description <- targets$Description[ran]

#get testing stuff
test_features <- norm_features[-ran, ]
#test_primary_type <- targets$Primary.Type[-ran]
#test_description <- targets$Description[-ran]

target_category <- data_cleaned[ran,6]
test_category <- data_cleaned[-ran,6]

summary(train_features)
 Location.Description     Date              District           Ward          Longitude         Latitude      Primary.Type      
 Length:14850         Length:14850       Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Length:14850      
 Class :character     Class :character   1st Qu.:0.2083   1st Qu.:0.2041   1st Qu.:0.9541   1st Qu.:0.9532   Class :character  
 Mode  :character     Mode  :character   Median :0.3750   Median :0.4490   Median :0.9656   Median :0.9694   Mode  :character  
                                         Mean   :0.4308   Mean   :0.4458   Mean   :0.9643   Mean   :0.9666                     
                                         3rd Qu.:0.6667   3rd Qu.:0.6735   3rd Qu.:0.9749   3rd Qu.:0.9787                     
                                         Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000                     
 Description       
 Length:14850      
 Class :character  
 Mode  :character  
                   
                   
                   
nrow(train_features)
[1] 14850
summary(test_features)
 Location.Description     Date              District           Ward          Longitude         Latitude      Primary.Type      
 Length:150           Length:150         Min.   :0.0000   Min.   :0.0000   Min.   :0.9227   Min.   :0.9318   Length:150        
 Class :character     Class :character   1st Qu.:0.1667   1st Qu.:0.1684   1st Qu.:0.9551   1st Qu.:0.9513   Class :character  
 Mode  :character     Mode  :character   Median :0.3750   Median :0.4082   Median :0.9671   Median :0.9682   Mode  :character  
                                         Mean   :0.4097   Mean   :0.4367   Mean   :0.9651   Mean   :0.9654                     
                                         3rd Qu.:0.6250   3rd Qu.:0.6735   3rd Qu.:0.9759   3rd Qu.:0.9781                     
                                         Max.   :1.0000   Max.   :1.0000   Max.   :0.9974   Max.   :0.9983                     
 Description       
 Length:150        
 Class :character  
 Mode  :character  
                   
                   
                   
nrow(test_features)
[1] 150
library(class)
#Euclidean Distance Function
euclid <- function(a, b) {
  sqrt(sum((a - b)^2))
}

#Nearest Neighbors algorithm
nearest_neighbors = function(x, obs, k, dist_func){
  dist = apply(x, 1, dist_func, obs) #apply along the rows 
  distances = sort(dist)[1:k]
  neighbor_list = which(dist %in% sort(dist)[1:k])
  return(list(neighbor_list, distances))
}

#take last function and create a predicted class based on nearest neighbors
knn_classifier = function(x,y){

  groups = table(x[,y])
  pred = groups[groups == max(groups)]
  return(pred)
}
classification_results <- data.frame(Prediction = character())
x = train_features[1:(nrow(train_features)),]


for (i in 1:nrow(test_features)) {
  #observation to be classified
  obs = test_features[i,]
  nearestNeigbhors = nearest_neighbors(x[,3:6], obs[,3:6],7, euclid)
  #print(nearestNeigbhors)
  ind = nearestNeigbhors[[1]]
  knn_classified = knn_classifier(x[ind,], 'Primary.Type')
  print(names(knn_classified))
  classification_results <- rbind(classification_results, Prediction = names(knn_classified))
  #print(classification_results)
  #print(obs[,7])
}
[1] "BATTERY"
[1] "THEFT"
[1] "BURGLARY"
[1] "BATTERY"
[1] "BATTERY"
[1] "THEFT"
[1] "BATTERY"
[1] "CRIMINAL DAMAGE"
[1] "BATTERY"
[1] "BATTERY"
[1] "THEFT"
[1] "THEFT"
[1] "NARCOTICS"
[1] "BURGLARY"
[1] "THEFT"
[1] "BATTERY"         "CRIMINAL DAMAGE"
Warning in rbind(deparse.level, ...) :
  number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "ASSAULT"                    "BATTERY"                    "BURGLARY"                   "DECEPTIVE PRACTICE"        
[5] "GAMBLING"                   "OFFENSE INVOLVING CHILDREN" "WEAPONS VIOLATION"         
Warning in rbind(deparse.level, ...) :
  number of columns of result, 1, is not a multiple of vector length 7 of arg 2
[1] "CRIMINAL DAMAGE"
[1] "ASSAULT"
[1] "NARCOTICS"
[1] "BURGLARY"  "NARCOTICS"
Warning in rbind(deparse.level, ...) :
  number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "NARCOTICS"
[1] "BATTERY"
[1] "BATTERY" "THEFT"  
Warning in rbind(deparse.level, ...) :
  number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "NARCOTICS"
[1] "BATTERY"
[1] "ASSAULT"
[1] "BURGLARY" "THEFT"   
Warning in rbind(deparse.level, ...) :
  number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "BATTERY"
[1] "NARCOTICS"
[1] "THEFT"
[1] "BATTERY"
[1] "ASSAULT"                "BATTERY"                "CRIMINAL DAMAGE"        "CRIMINAL TRESPASS"      "MOTOR VEHICLE THEFT"   
[6] "NARCOTICS"              "PUBLIC PEACE VIOLATION"
Warning in rbind(deparse.level, ...) :
  number of columns of result, 1, is not a multiple of vector length 7 of arg 2
[1] "BATTERY"
[1] "THEFT"
[1] "NARCOTICS"
[1] "THEFT"
[1] "THEFT"
[1] "BATTERY"
[1] "BATTERY"
[1] "OTHER OFFENSE"
[1] "CRIMINAL DAMAGE" "NARCOTICS"      
Warning in rbind(deparse.level, ...) :
  number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "NARCOTICS"
[1] "ASSAULT"         "CRIMINAL DAMAGE" "THEFT"          
Warning in rbind(deparse.level, ...) :
  number of columns of result, 1, is not a multiple of vector length 3 of arg 2
[1] "BATTERY"  "BURGLARY"
Warning in rbind(deparse.level, ...) :
  number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "ASSAULT"   "NARCOTICS"
Warning in rbind(deparse.level, ...) :
  number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "BURGLARY"
[1] "BATTERY"   "NARCOTICS"
Warning in rbind(deparse.level, ...) :
  number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "BATTERY" "ROBBERY"
Warning in rbind(deparse.level, ...) :
  number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "BATTERY"         "CRIMINAL DAMAGE" "ROBBERY"        
Warning in rbind(deparse.level, ...) :
  number of columns of result, 1, is not a multiple of vector length 3 of arg 2
[1] "ASSAULT"                    "OFFENSE INVOLVING CHILDREN" "THEFT"                     
Warning in rbind(deparse.level, ...) :
  number of columns of result, 1, is not a multiple of vector length 3 of arg 2
[1] "THEFT"
[1] "NARCOTICS"
[1] "BURGLARY"  "NARCOTICS"
Warning in rbind(deparse.level, ...) :
  number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "CRIMINAL DAMAGE"
[1] "ASSAULT"            "BATTERY"            "BURGLARY"           "DECEPTIVE PRACTICE" "HOMICIDE"           "OTHER OFFENSE"     
[7] "THEFT"             
Warning in rbind(deparse.level, ...) :
  number of columns of result, 1, is not a multiple of vector length 7 of arg 2
[1] "BATTERY"
[1] "BURGLARY"
[1] "CRIMINAL DAMAGE" "THEFT"          
Warning in rbind(deparse.level, ...) :
  number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "BATTERY"
[1] "ASSAULT" "BATTERY"
Warning in rbind(deparse.level, ...) :
  number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "THEFT"
[1] "CRIMINAL DAMAGE"
[1] "THEFT"
[1] "THEFT"
[1] "ASSAULT"                    "BURGLARY"                   "GAMBLING"                   "OFFENSE INVOLVING CHILDREN"
[5] "OTHER OFFENSE"              "THEFT"                      "WEAPONS VIOLATION"         
Warning in rbind(deparse.level, ...) :
  number of columns of result, 1, is not a multiple of vector length 7 of arg 2
[1] "BATTERY"
[1] "CRIMINAL DAMAGE" "THEFT"          
Warning in rbind(deparse.level, ...) :
  number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "ASSAULT"
[1] "THEFT"
[1] "NARCOTICS"
[1] "THEFT"
[1] "BATTERY"
[1] "THEFT"
[1] "CRIMINAL DAMAGE" "THEFT"          
Warning in rbind(deparse.level, ...) :
  number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "DECEPTIVE PRACTICE"  "HOMICIDE"            "MOTOR VEHICLE THEFT" "NARCOTICS"           "PROSTITUTION"        "ROBBERY"            
[7] "SEX OFFENSE"        
Warning in rbind(deparse.level, ...) :
  number of columns of result, 1, is not a multiple of vector length 7 of arg 2
[1] "WEAPONS VIOLATION"
[1] "CRIMINAL DAMAGE"
[1] "NARCOTICS"
[1] "THEFT"
[1] "PROSTITUTION"
[1] "BATTERY"
[1] "BATTERY"
[1] "THEFT"
[1] "DECEPTIVE PRACTICE" "THEFT"             
Warning in rbind(deparse.level, ...) :
  number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "OTHER OFFENSE"
[1] "THEFT"
[1] "THEFT"
[1] "NARCOTICS"
[1] "BATTERY"         "CRIMINAL DAMAGE" "THEFT"          
Warning in rbind(deparse.level, ...) :
  number of columns of result, 1, is not a multiple of vector length 3 of arg 2
[1] "CRIMINAL DAMAGE" "THEFT"          
Warning in rbind(deparse.level, ...) :
  number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "DECEPTIVE PRACTICE"
[1] "BATTERY"
[1] "BATTERY"
[1] "CRIMINAL DAMAGE"
[1] "THEFT"
[1] "BATTERY"
[1] "THEFT"
[1] "NARCOTICS"
[1] "ROBBERY"
[1] "BATTERY"   "NARCOTICS"
Warning in rbind(deparse.level, ...) :
  number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "BATTERY"
[1] "PROSTITUTION" "THEFT"       
Warning in rbind(deparse.level, ...) :
  number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "BATTERY"
[1] "BATTERY"
[1] "BATTERY"
[1] "OTHER OFFENSE" "THEFT"        
Warning in rbind(deparse.level, ...) :
  number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "THEFT"
[1] "BATTERY"         "CRIMINAL DAMAGE"
Warning in rbind(deparse.level, ...) :
  number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "ASSAULT"         "CRIMINAL DAMAGE"
Warning in rbind(deparse.level, ...) :
  number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "THEFT"
[1] "ASSAULT" "BATTERY"
Warning in rbind(deparse.level, ...) :
  number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "BATTERY"  "BURGLARY"
Warning in rbind(deparse.level, ...) :
  number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "CRIMINAL DAMAGE" "THEFT"          
Warning in rbind(deparse.level, ...) :
  number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "THEFT"
[1] "CRIMINAL DAMAGE"
[1] "ASSAULT"
[1] "CRIMINAL DAMAGE"
[1] "BATTERY"             "MOTOR VEHICLE THEFT"
Warning in rbind(deparse.level, ...) :
  number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "BATTERY"
[1] "THEFT"
[1] "THEFT"
[1] "BATTERY"
[1] "CRIMINAL DAMAGE"
[1] "OTHER OFFENSE"
[1] "BATTERY"
[1] "ASSAULT"         "CRIMINAL DAMAGE"
Warning in rbind(deparse.level, ...) :
  number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "NARCOTICS"
[1] "THEFT"
[1] "BATTERY"
[1] "BATTERY"
[1] "NARCOTICS"
[1] "BATTERY"
[1] "CRIMINAL DAMAGE" "THEFT"          
Warning in rbind(deparse.level, ...) :
  number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "BATTERY"
[1] "THEFT"
[1] "ASSAULT" "BATTERY"
Warning in rbind(deparse.level, ...) :
  number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "THEFT"
[1] "NARCOTICS"
[1] "NARCOTICS"
[1] "NARCOTICS" "THEFT"    
Warning in rbind(deparse.level, ...) :
  number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "BATTERY"         "CRIMINAL DAMAGE"
Warning in rbind(deparse.level, ...) :
  number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "BURGLARY"
[1] "BATTERY"
[1] "ASSAULT"                "BATTERY"                "CRIMINAL DAMAGE"        "MOTOR VEHICLE THEFT"    "NARCOTICS"             
[6] "PUBLIC PEACE VIOLATION" "THEFT"                 
Warning in rbind(deparse.level, ...) :
  number of columns of result, 1, is not a multiple of vector length 7 of arg 2
[1] "THEFT"
[1] "ASSAULT" "THEFT"  
Warning in rbind(deparse.level, ...) :
  number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "NARCOTICS"
[1] "BATTERY"
[1] "BATTERY"         "CRIMINAL DAMAGE"
Warning in rbind(deparse.level, ...) :
  number of columns of result, 1, is not a multiple of vector length 2 of arg 2
accuracy <- function(x){
  sum(diag(x)/(sum(rowSums(x)))) * 100
}
accuracy <- function(x,y){
  matches <- x == y
  accuracy <- sum(matches) / length(matches) * 100
  return(accuracy)
}

#tab <- table(classification_results,obs[1:5,7])
print("Accuracy")
[1] "Accuracy"
print(accuracy(classification_results,test_features[,7]))
[1] 18.66667
#indices based on first taxi dist and then euclid dist
#print off the sub-data frame without class label

#Unused Code, originally planned for further testing, but does not follow the methodology of the research paper.
cosine_distance <- function(a, b) {
  distance <- 1 - (sum(a * b) / (sqrt(sum(a^2)) * sqrt(sum(b^2))))
  return(distance)
}
encoded_location <- data_cleaned.matrix(~Location.Description - 1, data = data)
encoded_type <- data_cleaned.matrix(~Primary.Type - 1, data = data)
LS0tDQp0aXRsZTogIlIgTm90ZWJvb2siDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCi0tLQ0KDQpUaGlzIGlzIGFuIFtSIE1hcmtkb3duXShodHRwOi8vcm1hcmtkb3duLnJzdHVkaW8uY29tKSBOb3RlYm9vay4gV2hlbiB5b3UgZXhlY3V0ZSBjb2RlIHdpdGhpbiB0aGUgbm90ZWJvb2ssIHRoZSByZXN1bHRzIGFwcGVhciBiZW5lYXRoIHRoZSBjb2RlLiANCg0KVHJ5IGV4ZWN1dGluZyB0aGlzIGNodW5rIGJ5IGNsaWNraW5nIHRoZSAqUnVuKiBidXR0b24gd2l0aGluIHRoZSBjaHVuayBvciBieSBwbGFjaW5nIHlvdXIgY3Vyc29yIGluc2lkZSBpdCBhbmQgcHJlc3NpbmcgKkN0cmwrU2hpZnQrRW50ZXIqLiANCg0KYGBge3J9DQpgYGANCg0KDQpgYGB7cn0NCmxpYnJhcnkoZHBseXIpDQoNCiMgU3RlcCAxOiBMb2FkIHRoZSBkYXRhc2V0DQpjcmltZV9kYXRhIDwtIHJlYWQuY3N2KCJDcmltZXNfLV8yMDAxX3RvX1ByZXNlbnRfMjAyNDEyMDUuY3N2IikNCnN1bW1hcnkoY3JpbWVfZGF0YSkNCmhlYWQoY3JpbWVfZGF0YSREYXRlKQ0KbnJvdyhjcmltZV9kYXRhKQ0KYGBgDQoNCg0KYGBge3J9DQpsaW1pdGVkX2NyaW1lX2RhdGEgPSBjcmltZV9kYXRhDQoNCmBgYA0KDQoNCmBgYHtyfQ0KaGVhZChsaW1pdGVkX2NyaW1lX2RhdGEkRGF0ZSkNCg0KbGltaXRlZF9jcmltZV9kYXRhJERhdGUgPC0gYXMuRGF0ZShsaW1pdGVkX2NyaW1lX2RhdGEkRGF0ZSwgZm9ybWF0ID0gIiVtLyVkLyVZIikNCg0KDQpmaWx0ZXJlZF9kYXRhIDwtIGxpbWl0ZWRfY3JpbWVfZGF0YSAlPiUNCiAgZmlsdGVyKGJldHdlZW4oRGF0ZSwgYXMuRGF0ZSgiMjAwMS0wMS0wMSIpLA0KICAgICAgICAgYXMuRGF0ZSgiMjAxNS0wNC0wMSIpKSkNCg0Kc3VtbWFyeShmaWx0ZXJlZF9kYXRhKQ0KbnJvdyhmaWx0ZXJlZF9kYXRhKQ0KaGVhZChmaWx0ZXJlZF9kYXRhKQ0KYGBgDQpgYGB7cn0NCmZpbHRlcmVkX2RhdGEgPC1uYS5vbWl0KGZpbHRlcmVkX2RhdGEpDQpmaWx0ZXJlZF9kYXRhJERhdGUgPC0gYXMuRGF0ZShmaWx0ZXJlZF9kYXRhJERhdGUsIGZvcm1hdCA9ICIlWSIpDQpmaWx0ZXJlZF9kYXRhJERhdGUgPC0gZm9ybWF0KGZpbHRlcmVkX2RhdGEkRGF0ZSwgIiVZIikNCmZpbHRlcmVkX2RhdGEgPC0gZGlzdGluY3QoZmlsdGVyZWRfZGF0YSkNCmhlYWQoZmlsdGVyZWRfZGF0YSkNCg0Kc2FtcGxlZF9kYXRhIDwtIGZpbHRlcmVkX2RhdGEgJT4lICBncm91cF9ieShEYXRlKSAlPiUgc2xpY2Vfc2FtcGxlKG4gPSAxMDAwLCByZXBsYWNlID0gRkFMU0UpDQpucm93KHNhbXBsZWRfZGF0YSkNCmhlYWQoc2FtcGxlZF9kYXRhKQ0KYGBgDQoNCg0KYGBge3J9DQoNCg0KDQpzZWxlY3RlZF9mZWF0dXJlcyA8LSBjKCJMb2NhdGlvbi5EZXNjcmlwdGlvbiIsICJEYXRlIiwgIkRpc3RyaWN0IiwgIldhcmQiLA0KICAgICAgICAgICAgICAgICAgICAgICAiTG9uZ2l0dWRlIiwgIkxhdGl0dWRlIiwgIlByaW1hcnkuVHlwZSIsICJEZXNjcmlwdGlvbiIpDQpkYXRhX2NsZWFuZWQgPC0gc2FtcGxlZF9kYXRhICU+JSBzZWxlY3QoYWxsX29mKHNlbGVjdGVkX2ZlYXR1cmVzKSkNCg0Kc3VtbWFyeShkYXRhX2NsZWFuZWQpDQpucm93KGRhdGFfY2xlYW5lZCkNCg0KYGBgDQoNCg0KDQpgYGB7cn0NCg0KI05vcm1hbGl6YXRpb24NCm5vcm1hbCA8LWZ1bmN0aW9uKHgpIHsNCiAgKHggLW1pbih4KSkvKG1heCh4KS1taW4oeCkpICANCn0NCmRhdGFfY2xlYW5lZCA8LWFzLmRhdGEuZnJhbWUoZGF0YV9jbGVhbmVkKQ0KDQoNCmRhdGFfY2xlYW5lZFszOjZdIDwtIGFzLmRhdGEuZnJhbWUobGFwcGx5KGRhdGFfY2xlYW5lZFssYygzLDQsNSw2KV0sIG5vcm1hbCkpDQpub3JtX2ZlYXR1cmVzIDwtZGF0YV9jbGVhbmVkDQoNCg0KDQoNCiMgRGl2aWRlIGRhdGEgaW50byB0cmFpbmluZyBhbmQgdGVzdGluZyBzcGxpdA0Kc2V0LnNlZWQoMTIzKQ0KcmFuIDwtIHNhbXBsZSgxOm5yb3coZGF0YV9jbGVhbmVkKSwgMC45OSAqIG5yb3coZGF0YV9jbGVhbmVkKSkgDQoNCiNnZXQgdHJhaW5pbmcgc3R1ZmYNCnRyYWluX2ZlYXR1cmVzIDwtIG5vcm1fZmVhdHVyZXNbcmFuLF0NCg0KI2dldCB0ZXN0aW5nIHN0dWZmDQp0ZXN0X2ZlYXR1cmVzIDwtIG5vcm1fZmVhdHVyZXNbLXJhbiwgXQ0KDQoNCnRhcmdldF9jYXRlZ29yeSA8LSBkYXRhX2NsZWFuZWRbcmFuLDZdDQp0ZXN0X2NhdGVnb3J5IDwtIGRhdGFfY2xlYW5lZFstcmFuLDZdDQoNCnN1bW1hcnkodHJhaW5fZmVhdHVyZXMpDQpucm93KHRyYWluX2ZlYXR1cmVzKQ0Kc3VtbWFyeSh0ZXN0X2ZlYXR1cmVzKQ0KbnJvdyh0ZXN0X2ZlYXR1cmVzKQ0KYGBgDQoNCmBgYHtyfQ0KbGlicmFyeShjbGFzcykNCiNFdWNsaWRlYW4gRGlzdGFuY2UgRnVuY3Rpb24NCmV1Y2xpZCA8LSBmdW5jdGlvbihhLCBiKSB7DQogIHNxcnQoc3VtKChhIC0gYileMikpDQp9DQoNCiNOZWFyZXN0IE5laWdoYm9ycyBhbGdvcml0aG0NCm5lYXJlc3RfbmVpZ2hib3JzID0gZnVuY3Rpb24oeCwgb2JzLCBrLCBkaXN0X2Z1bmMpew0KICBkaXN0ID0gYXBwbHkoeCwgMSwgZGlzdF9mdW5jLCBvYnMpICNhcHBseSBhbG9uZyB0aGUgcm93cyANCiAgZGlzdGFuY2VzID0gc29ydChkaXN0KVsxOmtdDQogIG5laWdoYm9yX2xpc3QgPSB3aGljaChkaXN0ICVpbiUgc29ydChkaXN0KVsxOmtdKQ0KICByZXR1cm4obGlzdChuZWlnaGJvcl9saXN0LCBkaXN0YW5jZXMpKQ0KfQ0KDQojdGFrZSBsYXN0IGZ1bmN0aW9uIGFuZCBjcmVhdGUgYSBwcmVkaWN0ZWQgY2xhc3MgYmFzZWQgb24gbmVhcmVzdCBuZWlnaGJvcnMNCmtubl9jbGFzc2lmaWVyID0gZnVuY3Rpb24oeCx5KXsNCg0KICBncm91cHMgPSB0YWJsZSh4Wyx5XSkNCiAgcHJlZCA9IGdyb3Vwc1tncm91cHMgPT0gbWF4KGdyb3VwcyldDQogIHJldHVybihwcmVkKQ0KfQ0KYGBgDQoNCmBgYHtyfQ0KY2xhc3NpZmljYXRpb25fcmVzdWx0cyA8LSBkYXRhLmZyYW1lKFByZWRpY3Rpb24gPSBjaGFyYWN0ZXIoKSkNCmBgYA0KDQpgYGB7cn0NCnggPSB0cmFpbl9mZWF0dXJlc1sxOihucm93KHRyYWluX2ZlYXR1cmVzKSksXQ0KDQogDQpmb3IgKGkgaW4gMTpucm93KHRlc3RfZmVhdHVyZXMpKSB7DQoNCiAgb2JzID0gdGVzdF9mZWF0dXJlc1tpLF0NCiAgbmVhcmVzdE5laWdiaG9ycyA9IG5lYXJlc3RfbmVpZ2hib3JzKHhbLDM6Nl0sIG9ic1ssMzo2XSw3LCBldWNsaWQpDQogICNwcmludChuZWFyZXN0TmVpZ2Job3JzKQ0KICBpbmQgPSBuZWFyZXN0TmVpZ2Job3JzW1sxXV0NCiAga25uX2NsYXNzaWZpZWQgPSBrbm5fY2xhc3NpZmllcih4W2luZCxdLCAnUHJpbWFyeS5UeXBlJykNCiAgcHJpbnQobmFtZXMoa25uX2NsYXNzaWZpZWQpKQ0KICBjbGFzc2lmaWNhdGlvbl9yZXN1bHRzIDwtIHJiaW5kKGNsYXNzaWZpY2F0aW9uX3Jlc3VsdHMsIFByZWRpY3Rpb24gPSBuYW1lcyhrbm5fY2xhc3NpZmllZCkpDQogICNwcmludChjbGFzc2lmaWNhdGlvbl9yZXN1bHRzKQ0KICAjcHJpbnQob2JzWyw3XSkNCn0NCmFjY3VyYWN5IDwtIGZ1bmN0aW9uKHgpew0KICBzdW0oZGlhZyh4KS8oc3VtKHJvd1N1bXMoeCkpKSkgKiAxMDANCn0NCmBgYA0KDQoNCmBgYHtyfQ0KYWNjdXJhY3kgPC0gZnVuY3Rpb24oeCx5KXsNCiAgbWF0Y2hlcyA8LSB4ID09IHkNCiAgYWNjdXJhY3kgPC0gc3VtKG1hdGNoZXMpIC8gbGVuZ3RoKG1hdGNoZXMpICogMTAwDQogIHJldHVybihhY2N1cmFjeSkNCn0NCg0KDQpwcmludCgiQWNjdXJhY3kiKQ0KcHJpbnQoYWNjdXJhY3koY2xhc3NpZmljYXRpb25fcmVzdWx0cyx0ZXN0X2ZlYXR1cmVzWyw3XSkpDQoNCg0KDQpgYGANCg0KYGBge3J9DQoNCiNVbnVzZWQgQ29kZSwgb3JpZ2luYWxseSBwbGFubmVkIGZvciBmdXJ0aGVyIHRlc3RpbmcsIGJ1dCBkb2VzIG5vdCBmb2xsb3cgdGhlIG1ldGhvZG9sb2d5IG9mIHRoZSByZXNlYXJjaCBwYXBlci4NCmNvc2luZV9kaXN0YW5jZSA8LSBmdW5jdGlvbihhLCBiKSB7DQogIGRpc3RhbmNlIDwtIDEgLSAoc3VtKGEgKiBiKSAvIChzcXJ0KHN1bShhXjIpKSAqIHNxcnQoc3VtKGJeMikpKSkNCiAgcmV0dXJuKGRpc3RhbmNlKQ0KfQ0KZW5jb2RlZF9sb2NhdGlvbiA8LSBkYXRhX2NsZWFuZWQubWF0cml4KH5Mb2NhdGlvbi5EZXNjcmlwdGlvbiAtIDEsIGRhdGEgPSBkYXRhKQ0KZW5jb2RlZF90eXBlIDwtIGRhdGFfY2xlYW5lZC5tYXRyaXgoflByaW1hcnkuVHlwZSAtIDEsIGRhdGEgPSBkYXRhKQ0KDQoNCg0KDQpgYGANCg0KDQo=